firstly we import the libraries that will be used in our project¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

now we load the csv file in our jupyter notebook and we use the head command to see the upper 5 rows¶

In [2]:
data=pd.read_csv("PEP1.csv")
data.head()
Out[2]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

now we have to know the shape of the data set with this we will now how many rows and columns in the data set¶

In [3]:
data.shape
Out[3]:
(1460, 81)

so now we have to find how many columns in the data set¶

In [4]:
data.columns
Out[4]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchebvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functiol', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')
In [5]:
df=data.columns

so now we find the uniqueness of the columns i.e how many other value or data in one column¶

In [6]:
data[df].nunique()
Out[6]:
Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 81, dtype: int64

NOW WE SEPARATE THE NUMERICAL DATA FROM CATEGORICAL DATA AND FIND THE MISSING VALUE IN NUMERICAL DATA AND FILL THAT VALUE WITH NUMBER.¶

In [7]:
data_num=data.select_dtypes(include=np.number)
In [8]:
data_num
Out[8]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
0 1 60 65.0 8450 7 5 2003 2003 196.0 706 ... 0 61 0 0 0 0 0 2 2008 208500
1 2 20 80.0 9600 6 8 1976 1976 0.0 978 ... 298 0 0 0 0 0 0 5 2007 181500
2 3 60 68.0 11250 7 5 2001 2002 162.0 486 ... 0 42 0 0 0 0 0 9 2008 223500
3 4 70 60.0 9550 7 5 1915 1970 0.0 216 ... 0 35 272 0 0 0 0 2 2006 140000
4 5 60 84.0 14260 8 5 2000 2000 350.0 655 ... 192 84 0 0 0 0 0 12 2008 250000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 1456 60 62.0 7917 6 5 1999 2000 0.0 0 ... 0 40 0 0 0 0 0 8 2007 175000
1456 1457 20 85.0 13175 6 6 1978 1988 119.0 790 ... 349 0 0 0 0 0 0 2 2010 210000
1457 1458 70 66.0 9042 7 9 1941 2006 0.0 275 ... 0 60 0 0 0 0 2500 5 2010 266500
1458 1459 20 68.0 9717 5 6 1950 1996 0.0 49 ... 366 0 112 0 0 0 0 4 2010 142125
1459 1460 20 75.0 9937 5 6 1965 1965 0.0 830 ... 736 68 0 0 0 0 0 6 2008 147500

1460 rows × 38 columns

In [9]:
data_num.isnull().sum()
Out[9]:
Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchebvGr         0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

so firstly we have done the missing value treatment for "LOT FRONTAGE" data and after that for "GARAGEYRBLT"¶

In [10]:
data_num['LotFrontage'].fillna(data_num['LotFrontage'].median(),inplace=True)
In [11]:
data_num['GarageYrBlt'].fillna(data_num['GarageYrBlt'].median(),inplace=True)
In [12]:
data_num.isnull().sum()
Out[12]:
Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       8
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchebvGr       0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

so now we have just 8 null value in numerical data.this is less then 5% so we can drop that value directly¶

In [13]:
data_num.dropna(inplace=True)
In [14]:
data_num.isnull().sum()
Out[14]:
Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchebvGr       0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

with this we have fill all the null value with median method now there is no null value in the numerical data¶

In [15]:
# Identify the skewness and distribution
data_num["LotFrontage"].plot(kind="kde")
Out[15]:
<AxesSubplot:ylabel='Density'>

Identify significant variables using a correlation matrix¶

In [16]:
data_num.columns
Out[16]:
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchebvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
In [17]:
num_col=['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchebvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice']
In [18]:
# Correlation Matrix formation
corr_matrix =data_num.loc[:,num_col].corr()
print(corr_matrix)
                     Id  MSSubClass  LotFrontage   LotArea  OverallQual  \
Id             1.000000    0.011687    -0.011431 -0.032844    -0.032883   
MSSubClass     0.011687    1.000000    -0.354975 -0.138054     0.034491   
LotFrontage   -0.011431   -0.354975     1.000000  0.304684     0.233147   
LotArea       -0.032844   -0.138054     0.304684  1.000000     0.106324   
OverallQual   -0.032883    0.034491     0.233147  0.106324     1.000000   
OverallCond    0.013029   -0.061330    -0.053072 -0.002269    -0.090628   
YearBuilt     -0.015129    0.028397     0.116424  0.015639     0.571111   
YearRemodAdd  -0.024224    0.041047     0.082958  0.015126     0.549573   
MasVnrArea    -0.050298    0.022936     0.179459  0.104160     0.411876   
BsmtFinSF1    -0.007242   -0.069575     0.215610  0.213063     0.236823   
BsmtFinSF2    -0.005516   -0.066137     0.042781  0.111686    -0.058039   
BsmtUnfSF     -0.008274   -0.138922     0.121872 -0.004227     0.309602   
TotalBsmtSF   -0.017912   -0.236906     0.362862  0.258409     0.537122   
1stFlrSF       0.008684   -0.250050     0.414458  0.295919     0.476936   
2ndFlrSF       0.007333    0.308104     0.073386  0.052935     0.298543   
LowQualFinSF  -0.044125    0.046413     0.037645  0.004904    -0.029998   
GrLivArea      0.008356    0.076930     0.368004  0.261159     0.594417   
BsmtFullBath   0.001030    0.003807     0.091245  0.157702     0.108505   
BsmtHalfBath  -0.019809   -0.002633    -0.006822  0.048377    -0.039207   
FullBath       0.005673    0.136306     0.179172  0.122457     0.552266   
HalfBath       0.005652    0.176165     0.047926  0.016290     0.271466   
BedroomAbvGr   0.041511   -0.021651     0.236915  0.117778     0.105900   
KitchebvGr     0.004806    0.286572    -0.004674 -0.024697    -0.184642   
TotRmsAbvGrd   0.029185    0.042406     0.320213  0.187990     0.430549   
Fireplaces    -0.017536   -0.044466     0.233661  0.269643     0.400398   
GarageYrBlt   -0.002362    0.082333     0.062494 -0.025139     0.512611   
GarageCars     0.014997   -0.039043     0.268705  0.154739     0.599734   
GarageArea     0.015399   -0.098141     0.323132  0.180778     0.560543   
WoodDeckSF    -0.028146   -0.012634     0.074498  0.173167     0.240652   
OpenPorchSF   -0.006176   -0.005462     0.134672  0.086301     0.303482   
EnclosedPorch  0.004554   -0.010571     0.010692 -0.023094    -0.112950   
3SsnPorch     -0.046541   -0.044049     0.062176  0.020574     0.031029   
ScreenPorch    0.001769   -0.026414     0.037946  0.043511     0.066403   
PoolArea       0.057268    0.008214     0.181292  0.077888     0.065743   
MiscVal       -0.006114   -0.007805    -0.000192  0.038226    -0.031129   
MoSold         0.018962   -0.013840     0.009735  0.003203     0.068760   
YrSold         0.002776   -0.021529     0.007324 -0.012977    -0.025186   
SalePrice     -0.025343   -0.082813     0.333184  0.264674     0.789997   

               OverallCond  YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  \
Id                0.013029  -0.015129     -0.024224   -0.050298   -0.007242   
MSSubClass       -0.061330   0.028397      0.041047    0.022936   -0.069575   
LotFrontage      -0.053072   0.116424      0.082958    0.179459    0.215610   
LotArea          -0.002269   0.015639      0.015126    0.104160    0.213063   
OverallQual      -0.090628   0.571111      0.549573    0.411876    0.236823   
OverallCond       1.000000  -0.376763      0.075121   -0.128101   -0.041927   
YearBuilt        -0.376763   1.000000      0.590674    0.315707    0.249239   
YearRemodAdd      0.075121   0.590674      1.000000    0.179618    0.127609   
MasVnrArea       -0.128101   0.315707      0.179618    1.000000    0.264736   
BsmtFinSF1       -0.041927   0.249239      0.127609    0.264736    1.000000   
BsmtFinSF2        0.039333  -0.047816     -0.066672   -0.072319   -0.049287   
BsmtUnfSF        -0.136934   0.149810      0.181828    0.114442   -0.496137   
TotalBsmtSF      -0.167230   0.392562      0.291492    0.363936    0.520533   
1stFlrSF         -0.138814   0.284570      0.242488    0.344501    0.443232   
2ndFlrSF          0.027473   0.009566      0.140225    0.174561   -0.135715   
LowQualFinSF      0.025140  -0.183749     -0.062045   -0.069071   -0.064345   
GrLivArea        -0.076541   0.199343      0.288279    0.390857    0.206027   
BsmtFullBath     -0.051567   0.186305      0.118169    0.085310    0.647346   
BsmtHalfBath      0.117290  -0.037072     -0.011312    0.026673    0.068611   
FullBath         -0.190396   0.469625      0.440329    0.276833    0.055808   
HalfBath         -0.061434   0.240417      0.181063    0.201444    0.001952   
BedroomAbvGr      0.014274  -0.068619     -0.038429    0.102821   -0.105691   
KitchebvGr       -0.081254  -0.173951     -0.148527   -0.037610   -0.086473   
TotRmsAbvGrd     -0.055964   0.097440      0.193988    0.280682    0.044074   
Fireplaces       -0.020120   0.150148      0.114806    0.249070    0.258300   
GarageYrBlt      -0.306149   0.775884      0.614468    0.248346    0.147402   
GarageCars       -0.184866   0.537492      0.419815    0.364204    0.222241   
GarageArea       -0.151062   0.478439      0.370674    0.373066    0.295493   
WoodDeckSF       -0.004530   0.226891      0.207464    0.159718    0.205350   
OpenPorchSF      -0.031172   0.185081      0.223491    0.125703    0.107696   
EnclosedPorch     0.074731  -0.386839     -0.192367   -0.110204   -0.105608   
3SsnPorch         0.025163   0.032037      0.045907    0.018796    0.026995   
ScreenPorch       0.054016  -0.049169     -0.037656    0.061466    0.063299   
PoolArea         -0.002229   0.005310      0.006145    0.011723    0.141361   
MiscVal           0.068642  -0.034048     -0.009927   -0.029815    0.003910   
MoSold           -0.004034   0.009362      0.018588   -0.005965   -0.016053   
YrSold            0.043433  -0.014441      0.035352   -0.008201    0.016870   
SalePrice        -0.076294   0.522896      0.507158    0.477493    0.383977   

               ...  WoodDeckSF  OpenPorchSF  EnclosedPorch  3SsnPorch  \
Id             ...   -0.028146    -0.006176       0.004554  -0.046541   
MSSubClass     ...   -0.012634    -0.005462      -0.010571  -0.044049   
LotFrontage    ...    0.074498     0.134672       0.010692   0.062176   
LotArea        ...    0.173167     0.086301      -0.023094   0.020574   
OverallQual    ...    0.240652     0.303482      -0.112950   0.031029   
OverallCond    ...   -0.004530    -0.031172       0.074731   0.025163   
YearBuilt      ...    0.226891     0.185081      -0.386839   0.032037   
YearRemodAdd   ...    0.207464     0.223491      -0.192367   0.045907   
MasVnrArea     ...    0.159718     0.125703      -0.110204   0.018796   
BsmtFinSF1     ...    0.205350     0.107696      -0.105608   0.026995   
BsmtFinSF2     ...    0.067673     0.004294       0.036749  -0.030186   
BsmtUnfSF      ...   -0.004192     0.130217      -0.003684   0.020857   
TotalBsmtSF    ...    0.234182     0.244914      -0.099915   0.037960   
1stFlrSF       ...    0.238699     0.210625      -0.072610   0.056901   
2ndFlrSF       ...    0.090962     0.210512       0.064217  -0.024422   
LowQualFinSF   ...   -0.025669     0.018852       0.061314  -0.004373   
GrLivArea      ...    0.247981     0.330795       0.005813   0.021000   
BsmtFullBath   ...    0.175778     0.063937      -0.051483   0.000296   
BsmtHalfBath   ...    0.039929    -0.024489      -0.008518   0.034966   
FullBath       ...    0.189982     0.261509      -0.120246   0.036004   
HalfBath       ...    0.107275     0.196968      -0.093258  -0.004679   
BedroomAbvGr   ...    0.045614     0.098687       0.038447  -0.024667   
KitchebvGr     ...   -0.088863    -0.067892       0.028587  -0.024534   
TotRmsAbvGrd   ...    0.165236     0.237234       0.000861  -0.006657   
Fireplaces     ...    0.198180     0.170942      -0.029461   0.011447   
GarageYrBlt    ...    0.221241     0.214768      -0.284534   0.023803   
GarageCars     ...    0.226669     0.211257      -0.151857   0.036116   
GarageArea     ...    0.225418     0.238895      -0.121603   0.035410   
WoodDeckSF     ...    1.000000     0.058911      -0.125486  -0.033008   
OpenPorchSF    ...    0.058911     1.000000      -0.090870  -0.005401   
EnclosedPorch  ...   -0.125486    -0.090870       1.000000  -0.037395   
3SsnPorch      ...   -0.033008    -0.005401      -0.037395   1.000000   
ScreenPorch    ...   -0.074740     0.075865      -0.083074  -0.031617   
PoolArea       ...    0.073454     0.061403       0.054397  -0.008036   
MiscVal        ...   -0.009694    -0.018335       0.018445   0.000298   
MoSold         ...    0.021789     0.068538      -0.025830   0.029761   
YrSold         ...    0.021575    -0.055585      -0.008496   0.018714   
SalePrice      ...    0.324650     0.311268      -0.128778   0.045247   

               ScreenPorch  PoolArea   MiscVal    MoSold    YrSold  SalePrice  
Id                0.001769  0.057268 -0.006114  0.018962  0.002776  -0.025343  
MSSubClass       -0.026414  0.008214 -0.007805 -0.013840 -0.021529  -0.082813  
LotFrontage       0.037946  0.181292 -0.000192  0.009735  0.007324   0.333184  
LotArea           0.043511  0.077888  0.038226  0.003203 -0.012977   0.264674  
OverallQual       0.066403  0.065743 -0.031129  0.068760 -0.025186   0.789997  
OverallCond       0.054016 -0.002229  0.068642 -0.004034  0.043433  -0.076294  
YearBuilt        -0.049169  0.005310 -0.034048  0.009362 -0.014441   0.522896  
YearRemodAdd     -0.037656  0.006145 -0.009927  0.018588  0.035352   0.507158  
MasVnrArea        0.061466  0.011723 -0.029815 -0.005965 -0.008201   0.477493  
BsmtFinSF1        0.063299  0.141361  0.003910 -0.016053  0.016870   0.383977  
BsmtFinSF2        0.088480  0.041610  0.004802 -0.014878  0.031851  -0.010316  
BsmtUnfSF        -0.012506 -0.035146 -0.023857  0.033432 -0.040377   0.215740  
TotalBsmtSF       0.085831  0.126820 -0.018237  0.011558 -0.011451   0.612971  
1stFlrSF          0.090338  0.132669 -0.020931  0.031148 -0.009063   0.606849  
2ndFlrSF          0.040771  0.081749  0.016257  0.039782 -0.031893   0.322710  
LowQualFinSF      0.026627  0.062115 -0.003851 -0.022102 -0.028954  -0.025263  
GrLivArea         0.102489  0.170808 -0.002192  0.053792 -0.035801   0.710080  
BsmtFullBath      0.024157  0.068057 -0.022813 -0.024940  0.067489   0.225027  
BsmtHalfBath      0.031774  0.019937 -0.007484  0.033352 -0.046571  -0.015993  
FullBath         -0.006959  0.050103 -0.013964  0.058944 -0.019985   0.562491  
HalfBath          0.073391  0.022636  0.001528 -0.008772 -0.010056   0.282040  
BedroomAbvGr      0.044270  0.070928  0.007728  0.052450 -0.038584   0.171934  
KitchebvGr       -0.051430 -0.014485  0.062926  0.031032  0.033943  -0.137419  
TotRmsAbvGrd      0.059632  0.083979  0.024853  0.041611 -0.034886   0.536311  
Fireplaces        0.185752  0.095602  0.001518  0.052030 -0.024917   0.468930  
GarageYrBlt      -0.075017 -0.014421 -0.031418  0.002013 -0.001371   0.466247  
GarageCars        0.051277  0.021140 -0.042900  0.039393 -0.038065   0.639686  
GarageArea        0.052130  0.061292 -0.027230  0.026719 -0.025754   0.622492  
WoodDeckSF       -0.074740  0.073454 -0.009694  0.021789  0.021575   0.324650  
OpenPorchSF       0.075865  0.061403 -0.018335  0.068538 -0.055585   0.311268  
EnclosedPorch    -0.083074  0.054397  0.018445 -0.025830 -0.008496  -0.128778  
3SsnPorch        -0.031617 -0.008036  0.000298  0.029761  0.018714   0.045247  
ScreenPorch       1.000000  0.051216  0.031822  0.023695  0.010786   0.113044  
PoolArea          0.051216  1.000000  0.029636 -0.033785 -0.059800   0.093109  
MiscVal           0.031822  0.029636  1.000000 -0.006400  0.004938  -0.020951  
MoSold            0.023695 -0.033785 -0.006400  1.000000 -0.145367   0.045136  
YrSold            0.010786 -0.059800  0.004938 -0.145367  1.000000  -0.026180  
SalePrice         0.113044  0.093109 -0.020951  0.045136 -0.026180   1.000000  

[38 rows x 38 columns]
In [19]:
#Using heatmap to visualize the correlation matrix
sns.heatmap(corr_matrix, annot=True)
Out[19]:
<AxesSubplot:>

PAIR PLOT¶

In [20]:
# Pair plot for distribution and density
sns.pairplot(data)
Out[20]:
<seaborn.axisgrid.PairGrid at 0x1a917590b20>
In [21]:
sns.pairplot(data[['LotFrontage', 'SalePrice']])
Out[21]:
<seaborn.axisgrid.PairGrid at 0x1a959a82fd0>

EDA of categorical variables¶

In [23]:
data_cat=data.select_dtypes(exclude=np.number)
data_cat
Out[23]:
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition
0 RL Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
1 RL Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
2 RL Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
3 RL Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm ... Detchd Unf TA TA Y NaN NaN NaN WD Abnorml
4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 RL Pave NaN Reg Lvl AllPub Inside Gtl Gilbert Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
1456 RL Pave NaN Reg Lvl AllPub Inside Gtl NWAmes Norm ... Attchd Unf TA TA Y NaN MnPrv NaN WD Normal
1457 RL Pave NaN Reg Lvl AllPub Inside Gtl Crawfor Norm ... Attchd RFn TA TA Y NaN GdPrv Shed WD Normal
1458 RL Pave NaN Reg Lvl AllPub Inside Gtl mes Norm ... Attchd Unf TA TA Y NaN NaN NaN WD Normal
1459 RL Pave NaN Reg Lvl AllPub Inside Gtl Edwards Norm ... Attchd Fin TA TA Y NaN NaN NaN WD Normal

1460 rows × 43 columns

NOW WITH THIS WE HAVE SEPARATED THE CATEGORICAL VALUE AND NOW FIND THE MISSING VALUE IN THIS AND REPLACE IT¶

In [24]:
data_cat.isnull().sum()
Out[24]:
MSZoning            0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinType2       38
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functiol            0
FireplaceQu       690
GarageType         81
GarageFinish       81
GarageQual         81
GarageCond         81
PavedDrive          0
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
dtype: int64

now we have to do the missing value treatment for categorical value with "MODE METHOD"¶

In [25]:
data_cat['Alley'].mode()
Out[25]:
0    Grvl
Name: Alley, dtype: object
In [26]:
data_cat['Alley'].fillna(data_cat['Alley'].mode()[0],inplace=True)
In [27]:
data_cat['MiscFeature'].mode()
Out[27]:
0    Shed
Name: MiscFeature, dtype: object
In [28]:
data_cat['MiscFeature'].fillna(data_cat['MiscFeature'].mode()[0],inplace=True)
In [29]:
data_cat['Fence'].mode()
Out[29]:
0    MnPrv
Name: Fence, dtype: object
In [30]:
data_cat['Fence'].fillna(data_cat['Fence'].mode()[0],inplace=True)
In [31]:
data_cat['PoolQC'].mode()
Out[31]:
0    Gd
Name: PoolQC, dtype: object
In [32]:
data_cat['PoolQC'].fillna(data_cat['PoolQC'].mode()[0],inplace=True)
In [33]:
data_cat['FireplaceQu'].mode()
Out[33]:
0    Gd
Name: FireplaceQu, dtype: object
In [34]:
data_cat['FireplaceQu'].fillna(data_cat['FireplaceQu'].mode()[0],inplace=True)
In [35]:
data_cat['GarageType'].mode()
Out[35]:
0    Attchd
Name: GarageType, dtype: object
In [36]:
data_cat['GarageType'].fillna(data_cat['GarageType'].mode()[0],inplace=True)
In [37]:
data_cat['GarageFinish'].mode()
Out[37]:
0    Unf
Name: GarageFinish, dtype: object
In [38]:
data_cat['GarageFinish'].fillna(data_cat['GarageFinish'].mode()[0],inplace=True)
In [39]:
data_cat['GarageQual'].mode()
Out[39]:
0    TA
Name: GarageQual, dtype: object
In [40]:
data_cat['GarageQual'].fillna(data_cat['GarageQual'].mode()[0],inplace=True)
In [41]:
data_cat['GarageCond'].mode()
Out[41]:
0    TA
Name: GarageCond, dtype: object
In [42]:
data_cat['GarageCond'].fillna(data_cat['GarageCond'].mode()[0],inplace=True)
In [43]:
data_cat.isnull().sum()
Out[43]:
MSZoning          0
Street            0
Alley             0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
KitchenQual       0
Functiol          0
FireplaceQu       0
GarageType        0
GarageFinish      0
GarageQual        0
GarageCond        0
PavedDrive        0
PoolQC            0
Fence             0
MiscFeature       0
SaleType          0
SaleCondition     0
dtype: int64

so with this we have fill the categorical null values with mode method now rest of the categorical null value is less then 5% so now we can directly drop that value¶

In [44]:
data_cat.dropna(inplace=True)
In [45]:
data_cat.isnull().sum()
Out[45]:
MSZoning         0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functiol         0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
dtype: int64

Count plot and box plot for bivariate analysis¶

In [46]:
sns.boxplot(data=data, x="SalePrice", y="SaleCondition")
Out[46]:
<AxesSubplot:xlabel='SalePrice', ylabel='SaleCondition'>
In [47]:
sns.countplot(x=data["LotShape"],hue=data['LotShape'])
plt.title("Feature Engineering")
Out[47]:
Text(0.5, 1.0, 'Feature Engineering')

now we have to combine the data_num and data_cat with the help of concat function¶

In [48]:
concat_df=pd.concat([data_cat,data_num])
concat_df
Out[48]:
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
0 RL Pave Grvl Reg Lvl AllPub Inside Gtl CollgCr Norm ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 RL Pave Grvl Reg Lvl AllPub FR2 Gtl Veenker Feedr ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 RL Pave Grvl IR1 Lvl AllPub Inside Gtl CollgCr Norm ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 RL Pave Grvl IR1 Lvl AllPub Corner Gtl Crawfor Norm ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 RL Pave Grvl IR1 Lvl AllPub FR2 Gtl NoRidge Norm ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.0 40.0 0.0 0.0 0.0 0.0 0.0 8.0 2007.0 175000.0
1456 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 349.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 2010.0 210000.0
1457 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.0 60.0 0.0 0.0 0.0 0.0 2500.0 5.0 2010.0 266500.0
1458 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 366.0 0.0 112.0 0.0 0.0 0.0 0.0 4.0 2010.0 142125.0
1459 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 736.0 68.0 0.0 0.0 0.0 0.0 0.0 6.0 2008.0 147500.0

2864 rows × 81 columns

In [49]:
sns.boxplot(data=concat_df, x="SalePrice")
Out[49]:
<AxesSubplot:xlabel='SalePrice'>
In [ ]: